在上一篇中,我們最終定義出希望得到的結果如下:
<?php
namespace Recca0120\Ithome30\Tests;
use Mockery;
use GuzzleHttp\Client;
use GuzzleHttp\Psr7\Request;
use PHPUnit\Framework\TestCase;
use Recca0120\Ithome30\PttCrawler;
use Psr\Http\Client\ClientInterface;
/**
 * Top-level expectation for PttCrawler: calling all() yields article records,
 * and the HTTP client is asked for the hot-boards page.
 */
class PttCrawlerTest extends TestCase
{
    /**
     * Runs PttCrawler::all() against the 'ptt_home.yaml' cassette and checks
     * the first returned record plus the request sent to the HTTP client.
     */
    public function test_fetch_board_page(): void
    {
        \VCR\VCR::turnOn();

        // Everything after turnOn() lives in try/finally so that a failing
        // assertion (or an exception from the crawler) can never leave the
        // recorder switched on for the tests that run afterwards.
        try {
            \VCR\VCR::insertCassette('ptt_home.yaml');

            /** @var Mockery\Mock|ClientInterface $httpClient */
            $httpClient = Mockery::spy(new Client());
            $crawler = new PttCrawler($httpClient);

            $records = $crawler->all();

            self::assertEquals([
                'board_name' => 'Gossiping',
                'board_class' => '綜合',
                'nrec' => '4',
                'type' => '協尋',
                'title' => '9/30 10:15員林回春中醫診所前車禍行車',
                'author' => 'tyujm',
                'date' => '10/3',
                'url' => 'https://www.ptt.cc/bbs/Gossiping/M.1696296400.A.A9B.html',
            ], $records[0]);

            // The spy must have received exactly one sendRequest() call whose
            // request URI is the hot-boards page.
            $httpClient->shouldHaveReceived('sendRequest')->once()->with(Mockery::on(function (Request $request) {
                return (((string)$request->getUri()) === 'https://www.ptt.cc/bbs/hotboards.html');
            }));
        } finally {
            \VCR\VCR::eject();
            \VCR\VCR::turnOff();
        }
    }
}
所以我們的主程式可能就會這樣寫 ...
<?php
// src/PttCrawler.php
namespace Recca0120\Ithome30;
use Psr\Http\Client\ClientInterface;
use Recca0120\Ithome30\Crawlers\Board;
use Recca0120\Ithome30\Crawlers\Home;
/**
 * Entry point of the crawler: walks every board reported by the Home crawler
 * and collects the records the Board crawler returns for each of them.
 */
class PttCrawler
{
    /**
     * @param ClientInterface $httpClient client handed to both the Home and the Board crawler
     */
    public function __construct(private ClientInterface $httpClient)
    {
    }

    /**
     * Fetch the records of every board into one flat list.
     *
     * @return array flat list; each element is one item yielded by Board::fetch()
     */
    public function all(): array
    {
        $homeCrawler = new Home($this->httpClient);
        $boardCrawler = new Board($this->httpClient);

        $results = [];
        // NOTE(review): Home::all() is defined elsewhere; each item is passed
        // unchanged to Board::fetch() — confirm it has the shape fetch() expects.
        foreach ($homeCrawler->all() as $board) {
            foreach ($boardCrawler->fetch($board) as $article) {
                $results[] = $article;
            }
        }

        return $results;
    }
}
如果持續這樣寫下去,我們的測試就會變成大雜燴;當其中一個環節出錯時,就會變得很難除錯。那要怎麼解決這個困境呢?我們直接另外寫一個抓取看板文章列表的測試,讓我們可以只專注在抓取看板文章列表的開發上,所以可以先寫出這樣的測試案例:
<?php
namespace Recca0120\Ithome30\Tests\Cralwers;
use Mockery;
use GuzzleHttp\Client;
use PHPUnit\Framework\TestCase;
use Recca0120\Ithome30\Crawlers\Board;
/**
 * Focused test for the Board crawler: given one board record it must return
 * the parsed article rows of that board.
 */
class BoardTest extends TestCase
{
    /**
     * Runs Board::fetch() against the 'ptt_board_gossiping.yaml' cassette and
     * checks the first parsed article record.
     */
    public function test_fetch_board_articles_list(): void
    {
        \VCR\VCR::turnOn();

        // try/finally guarantees the recorder is ejected and switched off even
        // when the assertion below fails, so later tests are not affected.
        try {
            \VCR\VCR::insertCassette('ptt_board_gossiping.yaml');

            /** @var Mockery\Mock|\Psr\Http\Client\ClientInterface $httpClient */
            $httpClient = Mockery::spy(new Client());
            $board = new Board($httpClient);

            $records = $board->fetch([
                'name' => 'Gossiping',
                'nuser' => '8803',
                'class' => '綜合',
                'title' => '[八卦] 亞運李智凱、許皓鋐奪金!',
                'url' => 'https://www.ptt.cc/bbs/Gossiping/index.html',
            ]);

            self::assertEquals([
                'board_name' => 'Gossiping',
                'board_class' => '綜合',
                'nrec' => '4',
                'type' => '問卦',
                'title' => '保全可以睡覺滑手機那擺個稻草人就好了',
                'author' => 'Ommmmmm5566',
                'date' => '10/05',
                'url' => 'https://www.ptt.cc/bbs/Gossiping/M.1696519188.A.5C8.html',
            ], $records[0]);
        } finally {
            \VCR\VCR::eject();
            \VCR\VCR::turnOff();
        }
    }
}
測試案例出來後,我們就可以進行 Board 的實作
<?php
namespace Recca0120\Ithome30\Crawlers;
use GuzzleHttp\Psr7\Request;
use Psr\Http\Client\ClientInterface;
/**
 * Downloads one board index page and turns every article row into a record.
 */
class Board
{
    /**
     * @param ClientInterface $httpClient client used to send the page request
     */
    public function __construct(private ClientInterface $httpClient)
    {
    }

    /**
     * Fetch the page at $board['url'] and parse its article rows.
     *
     * Each record contains the columns found in the row (nrec, title, author,
     * date) plus 'url', 'type', 'board_name' and 'board_class'. Rows without a
     * link get 'url' => null; rows whose title has no "[type]" prefix get
     * 'type' => '' and keep their whole title text.
     *
     * @param array $board board record; the keys 'url', 'name' and 'class' are read
     * @return array list of article records
     */
    public function fetch(array $board): array
    {
        // Browser-like headers; the over18 cookie is presumably what lets
        // age-restricted boards answer without the confirmation page.
        $request = new Request('GET', $board['url'], [
            'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding' => 'gzip, deflate, br',
            'Accept-Language' => 'zh-TW,zh;q=0.8',
            'Cache-Control' => 'max-age=0',
            'Cookie' => 'over18=1',
            'Referer' => 'https://www.ptt.cc/bbs/Gossiping/index.html',
            'Sec-Ch-Ua' => '"Brave";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
            'Sec-Ch-Ua-Mobile' => '?0',
            'Sec-Ch-Ua-Platform' => '"macOS"',
            'Sec-Fetch-Dest' => 'document',
            'Sec-Fetch-Mode' => 'navigate',
            'Sec-Fetch-Site' => 'same-origin',
            'Sec-Fetch-User' => '?1',
            'Sec-Gpc' => '1',
            'Upgrade-Insecure-Requests' => '1',
            'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        ]);

        $response = $this->httpClient->sendRequest($request);
        $html = (string)$response->getBody();

        // One match per article row: from the r-ent opening tag up to the
        // closing </div> that follows the row's "mark" element.
        preg_match_all('/class="r-ent">.+<div class="mark">(.+)<\/div>/sU', $html, $matches);

        $results = [];
        foreach ($matches[0] ?? [] as $row) {
            preg_match_all('/<div class="(?<name>(nrec|title|author|date))"[^>]*>(?<value>.*?)<\/div>/s', $row, $matches2);

            $temp = [];
            foreach (array_keys($matches2[0]) as $index) {
                $temp[$matches2['name'][$index]] = trim($matches2['value'][$index]);
            }

            $titleHtml = $temp['title'] ?? '';

            // [^"]* stops at the closing quote, so extra attributes on the
            // anchor cannot leak into the URL. Rows without an anchor
            // (e.g. deleted posts) have no URL at all.
            $temp['url'] = preg_match('/href="([^"]*)"/', $titleHtml, $matched) === 1
                ? 'https://www.ptt.cc' . $matched[1]
                : null;

            // Split "[type] title". [^\]]+ keeps the type to the first bracket
            // pair even when the title itself contains further brackets.
            $plainTitle = trim(strip_tags($titleHtml));
            if (preg_match('/\[([^\]]+)\](.+)/', $plainTitle, $matched) === 1) {
                $temp['type'] = trim($matched[1]);
                $temp['title'] = trim($matched[2]);
            } else {
                $temp['type'] = '';
                $temp['title'] = $plainTitle;
            }

            $temp['nrec'] = strip_tags($temp['nrec'] ?? '');
            $temp['board_name'] = $board['name'];
            $temp['board_class'] = $board['class'];

            $results[] = $temp;
        }

        return $results;
    }
}
測試得到綠燈後,我們再進行重構
<?php
namespace Recca0120\Ithome30\Crawlers;
use GuzzleHttp\Psr7\Request;
use Psr\Http\Client\ClientInterface;
/**
 * Downloads one board index page and turns every article row into a record.
 */
class Board
{
    /**
     * @param ClientInterface $httpClient client used to send the page request
     */
    public function __construct(private ClientInterface $httpClient)
    {
    }

    /**
     * Fetch the page at $board['url'] and parse its article rows.
     *
     * @param array $board board record; the keys 'url', 'name' and 'class' are read
     * @return array list of article records, one per row found on the page
     */
    public function fetch(array $board): array
    {
        $response = $this->httpClient->sendRequest($this->createRequest($board['url']));
        $html = (string)$response->getBody();

        return array_map(
            fn (string $row) => $this->parseCols($row, $board),
            $this->parseRows($html)
        );
    }

    /**
     * Build the GET request for a board page with browser-like headers.
     * The over18 cookie is presumably what lets age-restricted boards answer
     * without the confirmation page.
     */
    private function createRequest(string $url): Request
    {
        return new Request('GET', $url, [
            'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding' => 'gzip, deflate, br',
            'Accept-Language' => 'zh-TW,zh;q=0.8',
            'Cache-Control' => 'max-age=0',
            'Cookie' => 'over18=1',
            'Referer' => 'https://www.ptt.cc/bbs/Gossiping/index.html',
            'Sec-Ch-Ua' => '"Brave";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
            'Sec-Ch-Ua-Mobile' => '?0',
            'Sec-Ch-Ua-Platform' => '"macOS"',
            'Sec-Fetch-Dest' => 'document',
            'Sec-Fetch-Mode' => 'navigate',
            'Sec-Fetch-Site' => 'same-origin',
            'Sec-Fetch-User' => '?1',
            'Sec-Gpc' => '1',
            'Upgrade-Insecure-Requests' => '1',
            'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        ]);
    }

    /**
     * Turn the HTML of one article row into a record.
     *
     * Rows without a link get 'url' => null; rows whose title has no "[type]"
     * prefix get 'type' => '' and keep their whole title text.
     *
     * @param string $row HTML of a single row as returned by parseRows()
     * @param array $board board record supplying 'name' and 'class'
     * @return array record with board_name, board_class, the parsed columns, url and type
     */
    private function parseCols(string $row, array $board): array
    {
        preg_match_all('/<div class="(?<name>(nrec|title|author|date))"[^>]*>(?<value>.*?)<\/div>/s', $row, $matches);

        $cols = [
            'board_name' => $board['name'],
            'board_class' => $board['class'],
        ];
        foreach (array_keys($matches[0]) as $index) {
            $cols[$matches['name'][$index]] = trim($matches['value'][$index]);
        }

        $cols['nrec'] = strip_tags($cols['nrec'] ?? '');

        $titleHtml = $cols['title'] ?? '';

        // [^"]* stops at the closing quote, so extra attributes on the anchor
        // cannot leak into the URL. Rows without an anchor have no URL.
        $cols['url'] = preg_match('/href="([^"]*)"/', $titleHtml, $matched) === 1
            ? 'https://www.ptt.cc' . $matched[1]
            : null;

        // Split "[type] title". [^\]]+ keeps the type to the first bracket
        // pair even when the title itself contains further brackets.
        $plainTitle = trim(strip_tags($titleHtml));
        if (preg_match('/\[([^\]]+)\](.+)/', $plainTitle, $matched) === 1) {
            $cols['type'] = trim($matched[1]);
            $cols['title'] = trim($matched[2]);
        } else {
            $cols['type'] = '';
            $cols['title'] = $plainTitle;
        }

        return $cols;
    }

    /**
     * Cut the page into article rows: each match runs from an r-ent opening
     * tag up to the closing </div> that follows the row's "mark" element.
     *
     * @return array list of row HTML fragments (empty when nothing matches)
     */
    private function parseRows(string $html): array
    {
        preg_match_all('/class="r-ent">.+<div class="mark">(.+)<\/div>/sU', $html, $matches);

        return $matches[0] ?? [];
    }
}
這樣就完成 Board 的程式了